{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Geotypicality\n",
    "\n",
    "A notebook to produce the interactive visualization of authors' geographic usage referenced in Evans and Wilkens, \"Nation, Ethnicity, and the Geography of British Fiction, 1880-1940,\" *Cultural Analytics* (2018). \n",
    "\n",
    "See the article and the [output visualization](https://plot.ly/~mattwilkens/119/british-literary-geography-1880-1940/#/) for additional details."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Imports and parameters\n",
    "%matplotlib inline\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import os\n",
    "from   IPython.display import display\n",
    "from   sklearn.decomposition import PCA\n",
    "from   sklearn.metrics.pairwise import paired_euclidean_distances\n",
    "from   sklearn.preprocessing import StandardScaler\n",
    "import plotly.plotly as py\n",
    "import plotly.graph_objs as go\n",
    "\n",
    "# Colors for use throughout, keyed by corpus name\n",
    "colors = dict(\n",
    "    Hathi='darkblue',\n",
    "    Foreign='darkred',\n",
    "    London='darkgreen',\n",
    "    Prominent='purple'\n",
    ")\n",
    "\n",
    "# Input and output locations; create any that do not exist yet\n",
    "figDir     = os.path.join('figures', 'geotyp')\n",
    "resultsDir = os.path.join('results', 'geotyp')\n",
    "dataDir    = os.path.join('results', 'publication', 'data')\n",
    "for directory in (figDir, resultsDir, dataDir):\n",
    "    os.makedirs(directory, exist_ok=True)\n",
    "\n",
    "# Plot styling\n",
    "sns.set()\n",
    "sns.set_context('talk')\n",
    "plt.rcParams['figure.figsize'] = (12.0, 8.0)\n",
    "\n",
    "# Place-name strings that the London filters in later cells count as London,\n",
    "# in addition to rows whose admin_2 value is 'Greater London'\n",
    "not_in_london = frozenset(('Thames', 'Thames River', 'River Thames', 'Kew Gardens'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unique HTIDs across all corpora: 10765\n",
      "\n",
      "Volumes per corpus: corpus\n",
      "Foreign        130\n",
      "Hathi        10010\n",
      "London         171\n",
      "Prominent      576\n",
      "Name: htid, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Load the data and express each count as occurrences per 100,000 words\n",
    "data = pd.read_csv(os.path.join(dataDir, 'data.tsv.gz'), sep='\\t', low_memory=False)\n",
    "data['occurs_100k'] = data['occurs']*100000/data['wordcount']\n",
    "\n",
    "# Row masks for the geographic subsets used below\n",
    "is_gb     = data.country_short == 'GB'\n",
    "is_london = is_gb & ((data.admin_2 == 'Greater London') | data.text_string.isin(not_in_london))\n",
    "\n",
    "# Per-corpus groupings of the full data and of each subset\n",
    "corpus_grouped        = data.groupby('corpus')\n",
    "corpus_gb_grouped     = data.loc[is_gb].groupby('corpus')\n",
    "corpus_nongb_grouped  = data.loc[~is_gb].groupby('corpus')\n",
    "corpus_london_grouped = data.loc[is_london].groupby('corpus')\n",
    "\n",
    "# Metadata-only view of the corpus (no geographic columns), de-duplicated.\n",
    "# Much smaller than the full dataset and used for quick lookups below.\n",
    "metadata_columns = ['htid', 'corpus', 'author', 'title', 'pub_date',\n",
    "                    'genre', 'gender', 'white', 'origin_area', 'origin_nation',\n",
    "                    'wordcount']\n",
    "metadata = data[metadata_columns].drop_duplicates()\n",
    "metadata_grouped = metadata.groupby('corpus')\n",
    "\n",
    "print('Unique HTIDs across all corpora:', data.htid.nunique())\n",
    "print('\\nVolumes per corpus:', corpus_grouped.htid.nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def auth_means(df, target, context, corpus):\n",
    "    '''\n",
    "    Summarize a per-author quantity for one corpus and print its weighted mean.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas GroupBy\n",
    "        Per-author records grouped by corpus. The group for `corpus` must\n",
    "        provide the columns 'author', 'weight' and `target`.\n",
    "    target : str\n",
    "        Name of the column holding the quantity of interest.\n",
    "    context : int\n",
    "        Number of authors on either side of the weighted mean to show in the\n",
    "        display call below, which is currently commented out; otherwise unused.\n",
    "    corpus : str\n",
    "        Key of the corpus group to summarize.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per author with the columns 'author', `target`, 'weight',\n",
    "        'pub_date_avg', 'books', 'wordcount', 'ethnicity' and 'corpus',\n",
    "        sorted by `target` in descending order.\n",
    "\n",
    "    Side effects: writes the returned frame to\n",
    "    <resultsDir>/<target>_by_auth_<corpus>.tsv and prints the weighted mean.\n",
    "    Reads the notebook-level `metadata` frame and `resultsDir` path.\n",
    "    '''\n",
    "    keep = [target, 'weight', 'author', 'pub_date', 'gender', 'white', 'origin_area', 'wordcount']\n",
    "    # Left-join this corpus's metadata onto the per-author rows by author name\n",
    "    merged = df.get_group(corpus).merge(metadata[metadata.corpus==corpus], how='left', on='author')\n",
    "    merged = merged[keep].sort_values(target, ascending=False)\n",
    "\n",
    "    output = []\n",
    "    for auth, rec in merged.groupby('author'):\n",
    "        output.append((\n",
    "            auth,\n",
    "            rec[target].mean(),\n",
    "            rec.weight.mean(),\n",
    "            rec.pub_date.mean(),\n",
    "            rec.wordcount.count(),  # books: merged rows that have a word count\n",
    "            rec.wordcount.sum(),\n",
    "            rec.white.values[0],    # first merged row; assumes `white` is constant per author - TODO confirm\n",
    "            corpus\n",
    "        ))\n",
    "    columns = ['author', target, 'weight', 'pub_date_avg', 'books', 'wordcount', 'ethnicity', 'corpus']\n",
    "    p = pd.DataFrame.from_records(output, columns=columns)\n",
    "    p = p.sort_values(target, ascending=False)\n",
    "    p.to_csv(os.path.join(resultsDir, target+'_by_auth_'+corpus+'.tsv'), sep='\\t')\n",
    "\n",
    "    mean_target = sum(p[target]*p.weight)/p.weight.sum()\n",
    "    print('Weighted mean of %s in %s:'%(target, corpus), round(mean_target, 4))\n",
    "\n",
    "    # Position in p of the author closest to the weighted mean. idxmin returns an\n",
    "    # index label in every pandas version, whereas Series.argmin returns a label in\n",
    "    # older releases and a position in newer ones.\n",
    "    closest = p.index.get_loc(np.abs(p[target]-mean_target).idxmin())\n",
    "    #display(p.iloc[range(closest-context, closest+context)])\n",
    "    return p"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## International fraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Foreign', 'Hathi', 'London', 'Prominent']\n"
     ]
    }
   ],
   "source": [
    "# Corpus names, taken from the keys of the per-corpus metadata grouping\n",
    "corpora = [corpus_name for corpus_name in metadata_grouped.groups]\n",
    "print(corpora)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weighted mean of intl_frac in Foreign: 0.8212\n",
      "Weighted mean of intl_frac in Hathi: 0.7129\n",
      "Weighted mean of intl_frac in London: 0.433\n",
      "Weighted mean of intl_frac in Prominent: 0.6264\n"
     ]
    }
   ],
   "source": [
    "# Build weighted per-author frame of GB vs. international share of mentions\n",
    "output = []\n",
    "for corpus_name, corpus_rows in corpus_grouped:\n",
    "    # Average occurrences (per 100k words) per volume in this corpus; normalizes the weights\n",
    "    avg_occurs = corpus_rows.occurs_100k.sum()/corpus_rows.htid.nunique()\n",
    "    # Rows are grouped by author, so each inner group holds all of that author's rows\n",
    "    for author, author_rows in corpus_rows.groupby('author'):\n",
    "        author_occurs = author_rows.occurs_100k.sum()\n",
    "        weight = author_occurs/avg_occurs\n",
    "        ethnicity = author_rows.white.values[0]\n",
    "        gb_frac = author_rows.loc[author_rows.country_short=='GB'].occurs_100k.sum()/author_occurs\n",
    "        output.append((corpus_name, author, ethnicity, gb_frac, weight))\n",
    "df = pd.DataFrame.from_records(output, columns=['corpus', 'author', 'ethnicity', 'gb_frac', 'weight'])\n",
    "df['intl_frac'] = 1 - df.gb_frac\n",
    "\n",
    "# Group and run tests\n",
    "intl_frac_authg = df.groupby('corpus')\n",
    "results = []\n",
    "for c in corpora:\n",
    "    result = auth_means(intl_frac_authg, 'intl_frac', 10, c)\n",
    "    # Examine Woolf and Joyce\n",
    "    #display(result[result.author.isin(['Woolf, Virginia', 'Joyce, James'])])\n",
    "    results.append(result[['corpus', 'author', 'intl_frac']])\n",
    "\n",
    "# Start building features data for geo-typicality calculation.\n",
    "# pd.concat is used because DataFrame.append is not available in recent pandas.\n",
    "geotyp = pd.concat(results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Intensity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weighted mean of occurs_100k in Foreign: 614.9341\n",
      "Weighted mean of occurs_100k in Hathi: 280.3757\n",
      "Weighted mean of occurs_100k in London: 273.0758\n",
      "Weighted mean of occurs_100k in Prominent: 267.1833\n"
     ]
    }
   ],
   "source": [
    "# Build weighted per-author frame of intensity: occurrences per 100k words,\n",
    "# averaged over the author's volumes\n",
    "output = []\n",
    "for corpus_name, corpus_rows in corpus_grouped:\n",
    "    # Rows are grouped by author, so each inner group holds all of that author's rows\n",
    "    for author, author_rows in corpus_rows.groupby('author'):\n",
    "        n_volumes = author_rows.htid.nunique()\n",
    "        weight = n_volumes  # authors are weighted by their number of volumes\n",
    "        ethnicity = author_rows.white.values[0]\n",
    "        intensity = author_rows.occurs_100k.sum()/n_volumes\n",
    "        output.append((corpus_name, author, ethnicity, intensity, weight))\n",
    "df = pd.DataFrame.from_records(output, columns=['corpus', 'author', 'ethnicity', 'occurs_100k', 'weight'])\n",
    "\n",
    "# Group and run tests\n",
    "intensity_authg = df.groupby('corpus')\n",
    "results = []\n",
    "for c in corpora:\n",
    "    result = auth_means(intensity_authg, 'occurs_100k', 10, c)\n",
    "    # Examine Woolf and Joyce\n",
    "    #display(result[result.author.isin(['Woolf, Virginia', 'Joyce, James'])])\n",
    "    results.append(result[['author', 'occurs_100k', 'corpus']])\n",
    "\n",
    "# Add intensity to geotyp frame.\n",
    "# pd.concat is used because DataFrame.append is not available in recent pandas.\n",
    "temp = pd.concat(results)\n",
    "geotyp = geotyp.merge(temp, how='left', on=['author', 'corpus'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specificity: World below city level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Specificity world-wide, below city level ...\n",
      "Weighted mean of spec_world_subcity in Foreign: 0.1847\n",
      "Weighted mean of spec_world_subcity in Hathi: 0.2448\n",
      "Weighted mean of spec_world_subcity in London: 0.3748\n",
      "Weighted mean of spec_world_subcity in Prominent: 0.2646\n"
     ]
    }
   ],
   "source": [
    "print('Specificity world-wide, below city level ...')\n",
    "\n",
    "# Location types that count as non-specific for this measure;\n",
    "# every other location type counts as specific\n",
    "nonspec_subcity = ('administrative_area_level_1',\n",
    "           'administrative_area_level_2',\n",
    "           'administrative_area_level_3',\n",
    "           'administrative_area_level_4',\n",
    "           'administrative_area_level_5',\n",
    "           'country',\n",
    "           'continent',\n",
    "           'locality',\n",
    "           'natural_feature'\n",
    "          )\n",
    "\n",
    "# Build weighted per-author frame of the specific share of all mentions\n",
    "output = []\n",
    "for corpus_name, corpus_rows in corpus_grouped:\n",
    "    # Average occurrences (per 100k words) per volume in this corpus; normalizes the weights\n",
    "    avg_occurs = corpus_rows.occurs_100k.sum()/corpus_rows.htid.nunique()\n",
    "    # Rows are grouped by author, so each inner group holds all of that author's rows\n",
    "    for author, author_rows in corpus_rows.groupby('author'):\n",
    "        author_occurs = author_rows.occurs_100k.sum()\n",
    "        weight = author_occurs/avg_occurs\n",
    "        ethnicity = author_rows.white.values[0]\n",
    "        is_specific = ~author_rows.location_type.isin(nonspec_subcity)\n",
    "        spec_frac = author_rows.loc[is_specific].occurs_100k.sum()/author_occurs\n",
    "        output.append((corpus_name, author, ethnicity, spec_frac, weight))\n",
    "n = pd.DataFrame.from_records(output, columns=['corpus', 'author', 'ethnicity', 'spec_world_subcity', 'weight'])\n",
    "\n",
    "# Group and run tests\n",
    "ng = n.groupby('corpus')\n",
    "results = []\n",
    "for c in corpora:\n",
    "    result = auth_means(ng, 'spec_world_subcity', 10, c)\n",
    "    # Examine Woolf and Joyce\n",
    "    #display(result[result.author.isin(['Woolf, Virginia', 'Joyce, James'])])\n",
    "    results.append(result[['author', 'spec_world_subcity', 'corpus']])\n",
    "\n",
    "# Add world-wide sub-city specificity to geotyp frame.\n",
    "# pd.concat is used because DataFrame.append is not available in recent pandas.\n",
    "temp = pd.concat(results)\n",
    "geotyp = geotyp.merge(temp, how='left', on=['author', 'corpus'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specificity: Non-GB below country level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weighted mean of spec_nongb_subcountry in Foreign: 0.5029\n",
      "Weighted mean of spec_nongb_subcountry in Hathi: 0.685\n",
      "Weighted mean of spec_nongb_subcountry in London: 0.6885\n",
      "Weighted mean of spec_nongb_subcountry in Prominent: 0.6678\n"
     ]
    }
   ],
   "source": [
    "# Different definition of non-specific here: only these location types\n",
    "# count as non-specific, and every other type counts as specific\n",
    "nonspec_subcountry = (\n",
    "           'country',\n",
    "           'continent',\n",
    "           'natural_feature'\n",
    "          )\n",
    "\n",
    "# Build weighted per-author frame of the specific share of non-GB mentions\n",
    "output = []\n",
    "for corpus_name, corpus_rows in corpus_nongb_grouped:\n",
    "    # Average non-GB occurrences (per 100k words) per volume in this corpus; normalizes the weights\n",
    "    avg_occurs = corpus_rows.occurs_100k.sum()/corpus_rows.htid.nunique()\n",
    "    # Rows are grouped by author, so each inner group holds all of that author's rows\n",
    "    for author, author_rows in corpus_rows.groupby('author'):\n",
    "        author_occurs = author_rows.occurs_100k.sum()\n",
    "        weight = author_occurs/avg_occurs\n",
    "        ethnicity = author_rows.white.values[0]\n",
    "        is_specific = ~author_rows.location_type.isin(nonspec_subcountry)\n",
    "        spec_frac = author_rows.loc[is_specific].occurs_100k.sum()/author_occurs\n",
    "        output.append((corpus_name, author, ethnicity, spec_frac, weight))\n",
    "df = pd.DataFrame.from_records(output, columns=['corpus', 'author', 'ethnicity', 'spec_nongb_subcountry', 'weight'])\n",
    "\n",
    "# Group and run tests\n",
    "ng = df.groupby('corpus')\n",
    "results = []\n",
    "for c in corpora:\n",
    "    result = auth_means(ng, 'spec_nongb_subcountry', 10, c)\n",
    "    # Examine Woolf and Joyce\n",
    "    #display(result[result.author.isin(['Woolf, Virginia', 'Joyce, James'])])\n",
    "    results.append(result[['author', 'spec_nongb_subcountry', 'corpus']])\n",
    "\n",
    "# Add non-GB sub-country specificity to geotyp frame.\n",
    "# pd.concat is used because DataFrame.append is not available in recent pandas.\n",
    "temp = pd.concat(results)\n",
    "geotyp = geotyp.merge(temp, how='left', on=['author', 'corpus'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## London as fraction of GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weighted mean of london_frac in Foreign: 0.4251\n",
      "Weighted mean of london_frac in Hathi: 0.4041\n",
      "Weighted mean of london_frac in London: 0.6855\n",
      "Weighted mean of london_frac in Prominent: 0.464\n"
     ]
    }
   ],
   "source": [
    "# Add the last feature, London/GB fraction\n",
    "# Build weighted per-author frame of London's share of GB mentions\n",
    "output = []\n",
    "for corpus_name, corpus_rows in corpus_gb_grouped:\n",
    "    # Average GB occurrences (per 100k words) per volume in this corpus; normalizes the weights\n",
    "    avg_occurs = corpus_rows.occurs_100k.sum()/corpus_rows.htid.nunique()\n",
    "    # Rows are grouped by author, so each inner group holds all of that author's rows\n",
    "    for author, author_rows in corpus_rows.groupby('author'):\n",
    "        author_occurs = author_rows.occurs_100k.sum()\n",
    "        weight = author_occurs/avg_occurs\n",
    "        ethnicity = author_rows.white.values[0]\n",
    "        # Same London definition as corpus_london_grouped: Greater London rows plus the listed place strings\n",
    "        is_london = (author_rows.admin_2=='Greater London') | (author_rows.text_string.isin(not_in_london))\n",
    "        london_frac = author_rows.loc[is_london].occurs_100k.sum()/author_occurs\n",
    "        output.append((corpus_name, author, ethnicity, london_frac, weight))\n",
    "df = pd.DataFrame.from_records(output, columns=['corpus', 'author', 'ethnicity', 'london_frac', 'weight'])\n",
    "\n",
    "# Group and run tests\n",
    "ng = df.groupby('corpus')\n",
    "results = []\n",
    "for c in corpora:\n",
    "    result = auth_means(ng, 'london_frac', 10, c)\n",
    "    # Examine Woolf and Joyce\n",
    "    #display(result[result.author.isin(['Woolf, Virginia', 'Joyce, James'])])\n",
    "    results.append(result[['author', 'london_frac', 'corpus']])\n",
    "\n",
    "# Add London fraction to geotyp frame.\n",
    "# pd.concat is used because DataFrame.append is not available in recent pandas.\n",
    "temp = pd.concat(results)\n",
    "geotyp = geotyp.merge(temp, how='left', on=['author', 'corpus'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Book count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weighted mean of volumes in Foreign: 2.434\n",
      "Weighted mean of volumes in Hathi: 2.1573\n",
      "Weighted mean of volumes in London: 2.1375\n",
      "Weighted mean of volumes in Prominent: 5.6471\n"
     ]
    }
   ],
   "source": [
    "# Build per-author frame of volume counts; every author gets weight 1,\n",
    "# so the weighted mean is the plain mean over authors\n",
    "output = []\n",
    "for corpus_name, corpus_rows in corpus_grouped:\n",
    "    for auth, books in corpus_rows.groupby('author'):\n",
    "        vols = books.htid.nunique()\n",
    "        weight = 1\n",
    "        output.append((corpus_name, auth, vols, weight))\n",
    "df = pd.DataFrame.from_records(output, columns=['corpus', 'author', 'volumes', 'weight'])\n",
    "\n",
    "# Group and run tests\n",
    "ng = df.groupby('corpus')\n",
    "results = []\n",
    "for c in corpora:\n",
    "    result = auth_means(ng, 'volumes', 10, c)\n",
    "    # Examine Woolf and Joyce\n",
    "    #display(result[result.author.isin(['Woolf, Virginia', 'Joyce, James'])])\n",
    "    results.append(result[['author', 'volumes', 'corpus']])\n",
    "\n",
    "# Add volume counts to geotyp frame.\n",
    "# pd.concat is used because DataFrame.append is not available in recent pandas.\n",
    "temp = pd.concat(results)\n",
    "geotyp = geotyp.merge(temp, how='left', on=['author', 'corpus'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The left merges above leave NaN where an author is missing from a feature's\n",
    "# source subset (e.g. no GB rows, hence no london_frac); treat those as 0\n",
    "geotyp = geotyp.fillna(value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>intl_frac</th>\n",
       "      <th>occurs_100k</th>\n",
       "      <th>spec_world_subcity</th>\n",
       "      <th>spec_nongb_subcountry</th>\n",
       "      <th>london_frac</th>\n",
       "      <th>volumes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4722.000000</td>\n",
       "      <td>4722.000000</td>\n",
       "      <td>4722.000000</td>\n",
       "      <td>4722.000000</td>\n",
       "      <td>4722.000000</td>\n",
       "      <td>4722.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.714263</td>\n",
       "      <td>300.160185</td>\n",
       "      <td>0.256358</td>\n",
       "      <td>0.690943</td>\n",
       "      <td>0.415946</td>\n",
       "      <td>2.235493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.222758</td>\n",
       "      <td>220.564654</td>\n",
       "      <td>0.132470</td>\n",
       "      <td>0.155327</td>\n",
       "      <td>0.239297</td>\n",
       "      <td>3.991999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.786702</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.542860</td>\n",
       "      <td>150.513591</td>\n",
       "      <td>0.164737</td>\n",
       "      <td>0.601760</td>\n",
       "      <td>0.237288</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.758628</td>\n",
       "      <td>245.120202</td>\n",
       "      <td>0.232335</td>\n",
       "      <td>0.705882</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.918919</td>\n",
       "      <td>384.723195</td>\n",
       "      <td>0.321776</td>\n",
       "      <td>0.795948</td>\n",
       "      <td>0.566570</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>2155.349480</td>\n",
       "      <td>0.938776</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>72.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         intl_frac  occurs_100k  spec_world_subcity  spec_nongb_subcountry  \\\n",
       "count  4722.000000  4722.000000         4722.000000            4722.000000   \n",
       "mean      0.714263   300.160185            0.256358               0.690943   \n",
       "std       0.222758   220.564654            0.132470               0.155327   \n",
       "min       0.000000     5.786702            0.000000               0.000000   \n",
       "25%       0.542860   150.513591            0.164737               0.601760   \n",
       "50%       0.758628   245.120202            0.232335               0.705882   \n",
       "75%       0.918919   384.723195            0.321776               0.795948   \n",
       "max       1.000000  2155.349480            0.938776               1.000000   \n",
       "\n",
       "       london_frac      volumes  \n",
       "count  4722.000000  4722.000000  \n",
       "mean      0.415946     2.235493  \n",
       "std       0.239297     3.991999  \n",
       "min       0.000000     1.000000  \n",
       "25%       0.237288     1.000000  \n",
       "50%       0.400000     1.000000  \n",
       "75%       0.566570     2.000000  \n",
       "max       1.000000    72.000000  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics for the numeric feature columns of the assembled geotyp table\n",
    "geotyp.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-2.24508632307e-16 1.0\n",
      "(4722, 5)\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtEAAAHWCAYAAACxJNUiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3W9wlfWd9/FPQpAkhBpBu1M6LCBx\n0GVFK65/0CkUuoPMIsxY6+xQ/4yMFkY61FWZWq2rW1vHXYtWcaTrjHbrn9YRO9NV1+mi4D5iq1Jm\nZWuV2jY4dlOnBpZUMTEmOfeD3Tv35ha3+ZkDh+Dr9SST6zrh+h35cvnml3BOXaVSqQQAABi2+lov\nAAAARhsRDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhRpqvYAP48033xr2Y+vq6jJp0vjs3r0vXs2P\nkTJPVItZolrMEtVilvbvmGMm7Pf4Yb8TXV//X0NRf9g/Uw4G80S1mCWqxSxRLWapjP9MAABQSEQD\nAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLR\nAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIUaar0A+CArbt1Sk+vef+2CmlwXABg9\n7EQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAA\nFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0A\nAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFCqK6B07duTs\ns88e/LyrqyurV6/OnDlzMn/+/GzcuHHwXG9vb6677rqcdtppmTt3bjZs2DB4rlKpZN26dTnjjDPy\nZ3/2Z/nGN76R/v7+KjwdAAA48IYV0ZVKJY899lhWrFiR9957b/D4DTfckObm5mzdujV33XVXvvWt\nb+WVV15Jktxxxx3p6OjI5s2b8/3vfz8bN27Mli1bkiQPP/xw/uVf/iWPP/54nnrqqWzfvj3f//73\nD8DTAwCA6htWRH/nO9/JAw88kFWrVg0e27dvX5555pmsWbMm48aNy+zZs7NkyZLB3ejHH388K1eu\nzIQJEzJt2rRceOGFefTRR5Mk//iP/5hLLrkkH//4x3PMMcdk5cqVg+cAAOBQ1zCcB33uc5/LqlWr\n8vzzzw8ee+2119LQ0JApU6YMHps+fXo2bdqUrq6udHZ2pq2tbci5hx9+OEny61//+n3nfvnLX6ZS\nqaSuru4Prqeuri71w/xBlPr6uiEf4Q8ZM+aDZ8U8US1miWoxS1SLWSozrIj++Mc//r5j77zzThob\nG4cca2xsTE9PT7q7u5MkTU1N7zuXJN3d3UO+tqmpKQMDA+nt7c24ceP+4HomTRo/rNj+n1pbxxc9\nno+uiRNb/uBjzBPVYpaoFrNEtZil4RlWRO9PU1PTYBT/Xz09PWlubh4M5J6enrS0tAw5l/xXUL/7\n7ruDX9fd3Z2GhoZhBXSS7N69r2gnurV1fPbu3ZeBgcrwvoiPtD173v7Ac+aJajFLVItZolrM0v59\n0Obah47oqVOnpq+vLx0dHZk8eXKSpL29PW1tbWltbc2kSZPS3t6eo48+evDcjBkzkiQzZsxIe3t7\nTjrppMFzxx577LCvXalUUvpiHgMDlfT3Gwj+sOHMiXmiWswS1WKWqBazNDwf+nWiW1pasnDhwqxb\nty7d3d3ZsWNHnnzyyZx77rlJkqVLl2b9+vXZu3dvdu3alYceeijLli0bPHfffffljTfeSGdnZ/7+\n7/9+8BwAABzqPvROdJLcfPPNufHGGz
Nv3rw0Nzdn7dq1g7vLV155ZW655ZYsXrw4dXV1ufjii7N4\n8eIkyfLly9PZ2Znzzz8/7733Xs4999xceumlI382AABwENRVKpVRt1//5ptvDfuxY8bUZeLEluzZ\n87ZvTYwyK27dUpPr3n/tgg88Z56oFrNEtZglqsUs7d8xx0zY73Fv+w0AAIVENAAAFBLRAABQSEQD\nAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLR\nAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVE\nNAAAFBLRAABQSEQDAEAhEQ0AAIUaar0AONSsuHVLza59/7ULanZtAGD47EQDAEAhEQ0AAIVENAAA\nFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0A\nAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQD\nAEAhEQ0AAIVENAAAFBLRAABQqKHWC+DQtuLWLbVeAgDAIcdONAAAFBLRAABQSEQDAEAhEQ0AAIVE\nNAAAFBLRAABQSEQDAEChEUf09u3bc9555+WUU07JokWL8sQTTyRJurq6snr16syZMyfz58/Pxo0b\nB7+mt7c31113XU477bTMnTs3GzZsGOkyAADgoBnRm6309/dn9erVufHGG3POOedk27ZtueSSS/Kp\nT30qf/d3f5fm5uZs3bo1O3fuzOWXX54TTzwxxx9/fO644450dHRk8+bN2b17d1asWJGZM2dmwYIF\n1XpeAABwwIxoJ/r3v/999uzZk/7+/lQqldTV1WXs2LEZM2ZMnnnmmaxZsybjxo3L7Nmzs2TJksHd\n6McffzwrV67MhAkTMm3atFx44YV59NFHq/KEAADgQBvRTvRRRx2V5cuX56qrrsratWszMDCQb37z\nm/nP//zPNDQ0ZMqUKYOPnT59ejZt2pSurq50dnamra1tyLmHH3542Netq6tL/TDzv76+bshHOJSN\nGWNOPyrcm6gWs0S1mKUyI4rogYGBNDY25s4778yCBQuydevWXH311dmwYUMaGxuHPLaxsTE9PT3p\n7u5OkjQ1Nb3v3HBNmjQ+dXVlv8GtreOLHg+1MHFiS62XwEHm3kS1mCWqxSwNz4gietOmTdmxY0e+\n8pWvJEnmz5+f+fPnZ/369e+L4p6enjQ3Nw/GdU9PT1paWoacG67du/cV7US3to7P3r37MjBQGfY1\noBb27Hm71kvgIHFvolrMEtVilvbvgza4RhTRv/3tb9Pb2zv0F2xoyKxZs/LTn/40HR0dmTx5cpKk\nvb09bW1taW1tzaRJk9Le3p6jjz568NyMGTOGfd1KpZL+/rK1DgxU0t9vIDi0mdGPHvcmqsUsUS1m\naXhG9A8L586dm5dffjk//OEPU6lU8vzzz+fpp5/OX/zFX2ThwoVZt25duru7s2PHjjz55JM599xz\nkyRLly7N+vXrs3fv3uzatSsPPfRQli1bVpUnBAAAB9qIInrmzJm566678sADD2TOnDn5+te/nr/9\n27/NiSeemJtvvjl9fX2ZN29e1qxZk7Vr1+akk05Kklx55ZWZNm1aFi9enOXLl+eCCy7I4sWLq/KE\nAADgQKurVCqjbr/+zTffGvZjx4ypy8SJLdmz523fmvgQVty6pdZL+Ei5/1qvlf5R4d5EtZglqsUs\n7d8xx0zY73Fv+w0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVE\nNAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAh\nEQ
0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQ\nSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAA\nFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0A\nAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQD\nAEAhEQ0AAIVGHNFvvPFGVq5cmVNOOSWf/vSn88ADDyRJurq6snr16syZMyfz58/Pxo0bB7+mt7c3\n1113XU477bTMnTs3GzZsGOkyAADgoGkYyRdXKpVcccUVOf3003P33Xdn165d+cIXvpA//dM/zT/8\nwz+kubk5W7duzc6dO3P55ZfnxBNPzPHHH5877rgjHR0d2bx5c3bv3p0VK1Zk5syZWbBgQbWeFwAA\nHDAj2ol+8cUX87vf/S7XXHNNxo4dm+OOOy6PPPJI/uiP/ijPPPNM1qxZk3HjxmX27NlZsmTJ4G70\n448/npUrV2bChAmZNm1aLrzwwjz66KNVeUIAAHCgjWgn+qWXXspxxx2X2267LU888URaWlqyatWq\nzJw5Mw0NDZkyZcrgY6dPn55Nmzalq6srnZ2daWtrG3Lu4YcfHvZ16+rqUj/M/K+vrxvyEQ5lY8aY\n048K9yaqxSxRLWapzIgiuqurK88991zOOOOMPPvss/nZz36Wyy67LPfee28aGxuHPLaxsTE9PT3p\n7u5OkjQ1Nb3v3HBNmjQ+dXVlv8GtreOLHg+1MHFiS62XwEHm3kS1mCWqxSwNz4gi+ogjjsiRRx6Z\nlStXJklOOeWULFq0KHfdddf7orinpyfNzc2Dcd3T05OWlpYh54Zr9+59RTvRra3js3fvvgwMVIZ9\nDaiFPXvervUSOEjcm6gWs0S1mKX9+6ANrhFF9PTp09Pd3Z2+vr40NPzXL9Xf358/+ZM/ybZt29LR\n0ZHJkycnSdrb29PW1pbW1tZMmjQp7e3tOfroowfPzZgxY9jXrVQq6e8vW+vAQCX9/QaCQ5sZ/ehx\nb6JazBLVYpaGZ0T/sPCss87Kxz72saxbty59fX3Zvn17nn766ZxzzjlZuHBh1q1bl+7u7uzYsSNP\nPvlkzj333CTJ0qVLs379+uzduze7du3KQw89lGXLllXlCQEAwIE2oohubGzMgw8+mF/84heZO3du\nrrnmmnzta1/LySefnJtvvjl9fX2ZN29e1qxZk7Vr1+akk05Kklx55ZWZNm1aFi9enOXLl+eCCy7I\n4sWLq/KEAADgQKurVCqjbr/+zTffGvZjx4ypy8SJLdmz523fmvgQVty6pdZL4CC4/1qv0X6wuTdR\nLWaJajFL+3fMMRP2e9zbfgMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBI\nRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAU\nEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAA\nhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMA\nQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEA\nAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0\nAAAUEtEAAFBIRAMAQCERDQAAhaoW0Z2dnTnzzDPz7LPPJkl+85vf
5JJLLsmnPvWpLFq0aPB4knR1\ndWX16tWZM2dO5s+fn40bN1ZrGQAAcMBVLaKvv/767N27d/DzL3/5y5k9e3aef/75XHfddbn66quz\nZ8+eJMkNN9yQ5ubmbN26NXfddVe+9a1v5ZVXXqnWUgAA4ICqSkT/4Ac/SFNTUz7xiU8kSX71q1/l\nF7/4RVavXp2xY8dm3rx5Oe200/KjH/0o+/btyzPPPJM1a9Zk3LhxmT17dpYsWWI3GgCAUaNhpL/A\nrl278t3vfjePPvpozjvvvCTJr3/963zyk59MY2Pj4OOmT5+eV199Na+99loaGhoyZcqUIec2bdo0\n7GvW1dWlfpj5X19fN+Qj8H5jxvjzcbC5N1EtZolqMUtlRhTRfX19Wbt2ba6//vq0trYOHn/nnXfS\n1NQ05LGNjY3p6enJO++8MySu/+e54Zo0aXzq6sp+g1tbxxc9Hj5KJk5sqfUSPrLcm6gWs0S1mKXh\nGVFE33PPPTnhhBMyb968IcebmpreF8U9PT1pbm7+X88N1+7d+4p2oltbx2fv3n0ZGKgM+xrwUbJn\nz9u1XsJHjnsT1WKWqBaztH8ftNE0ooh+6qmn8uabb+app55Kkrz99tu56qqrsmrVqvzHf/xHent7\nc8QRRyRJ2tvbc/rpp2fq1Knp6+tLR0dHJk+ePHiura1t2NetVCrp7y9b68BAJf39BgL2x5+N2nFv\nolrMEtViloZnRBH94x//eMjnCxYsyA033JDPfOYz+ed//ud8+9vfzpVXXpl//dd/zXPPPZcbb7wx\nLS0tWbhwYdatW5dvfOMbefXVV/Pkk0/m3nvvHdETAT68Fbduqcl17792QU2uCwAjdcDebGX9+vXZ\nuXNnzjzzzNxyyy25/fbbB1+94+abb05fX1/mzZuXNWvWZO3atTnppJMO1FIAAKCqRvzqHP/Tli3/\nbzfrk5/8ZO677779Pq61tTV33nlnNS8NAAAHjbf9BgCAQiIaAAAKiWgAACgkogEAoJCIBgCAQiIa\nAAAKiWgAACgkogEAoJCIBgCAQiIaAAAKiWgAACgkogEAoJCIBgCAQg21XgDDs+LWLbVeAgAA/81O\nNAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAh\nEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQ\nSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAA\nFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0A\nAIVENAAAFGqo9QKAj64Vt26p2bXvv3ZBza4NwOhnJxoAAAqJaAAAKCSiAQCgkIgGAIBCIhoAAAqN\nOKK3bduWz3/+85kzZ04++9nP5pFHHkmSdHV1ZfXq1ZkzZ07mz5+fjRs3Dn5Nb29vrrvuupx22mmZ\nO3duNmzYMNJlAADAQTOil7jr6urKFVdcka997WtZsmRJXn755Vx66aX54z/+4zzyyCNpbm7O1q1b\ns3Pnzlx++eU58cQTc/zxx+eOO+5IR0dHNm/enN27d2fFihWZOXNmFizwklMAABz6RrQT3dHRkXnz\n5mXp0qWpr6/PrFmzcvrpp2f79u155plnsmbNmowbNy6zZ8/OkiVLBnejH3/88axcuTITJkzItGnT\ncuGFF+bRRx+tyhMCAIADbUQ70SeccEJuu+22wc+7urqybdu2zJw5Mw0NDZkyZcrguenTp2fTpk3p\n6upKZ2dn2trahpx7+OGHh33durq61A8z/+vr64Z8BEiSMWNqe09wb6JazBLVYpbKVO0dC996662s\nWrVqcDf6gQceGHK+sbExPT09
6e7uTpI0NTW979xwTZo0PnV1Zb/Bra3jix4PHN4mTmyp9RKSuDdR\nPWaJajFLw1OViH799dezatWqTJkyJd/+9rfzq1/96n1R3NPTk+bm5jQ2Ng5+3tLSMuTccO3eva9o\nJ7q1dXz27t2XgYHKsK8BHN727Hm7ptd3b6JazBLVYpb274M2XUYc0S+99FIuu+yyLF26NF/5yldS\nX1+fqVOnpq+vLx0dHZk8eXKSpL29PW1tbWltbc2kSZPS3t6eo48+evDcjBkzhn3NSqWS/v6ydQ4M\nVNLfbyCA/3Ko3A/cm6gWs0S1mKXhGdE/LOzs7Mxll12WSy+9NF/96ldT/9/bwy0tLVm4cGHWrVuX\n7u7u7NixI08++WTOPffcJMnSpUuzfv367N27N7t27cpDDz2UZcuWjfzZAADAQTCinejHHnsse/bs\nyYYNG4a81vPFF1+cm2++OTfeeGPmzZuX5ubmrF27NieddFKS5Morr8wtt9ySxYsXp66uLhdffHEW\nL148smcCAAAHSV2lUhl1+/VvvvnWsB87ZkxdJk5syZ49b4/qb02suHVLrZcAh5X7r63t69IfLvcm\nas8sUS1maf+OOWbCfo97228AACgkogEAoJCIBgCAQiIaAAAKiWgAACgkogEAoJCIBgCAQiIaAAAK\niWgAACgkogEAoJCIBgCAQiIaAAAKiWgAACgkogEAoJCIBgCAQiIaAAAKiWgAACgkogEAoJCIBgCA\nQiIaAAAKiWgAACgkogEAoJCIBgCAQiIaAAAKiWgAACjUUOsFANTCilu31OS691+7oCbXBaC67EQD\nAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIVENAAAFBLR\nAABQSEQDAEAhEQ0AAIVENAAAFBLRAABQSEQDAEAhEQ0AAIUaar0AgI+SFbduqdm17792Qc2uDXC4\nsRMNAACFRDQAABQS0QAAUMjPRBeo5c8yAgBw6LATDQAAhUQ0AAAUEtEAAFBIRAMAQCERDQAAhUQ0\nAAAUEtEAAFDI60QDfETU6rXu7792QU2uC3Ag2YkGAIBCIhoAAAqJaAAAKCSiAQCgkIgGAIBCXp0D\ngAOqVq8KknhlEODAsRMNAACF7EQDcNjy2tjAgWInGgAACtVsJ/rnP/95/vqv/zq//OUvM3Xq1PzN\n3/xNTj755FotBwBGPT9/DgdPTXai33333axatSrnnXdeXnjhhVx00UX50pe+lN7e3losBwAAitRk\nJ/onP/lJ6uvrs3z58iTJ+eefn+9973t59tlns2jRolosCQCqppY7wsDBUZOIbm9vz4wZM4Ycmz59\nel599dVhRXRdXV3qh7mHXl9fN+QjAFB9tfqLw/euX1iT6ybJJd/cXJPrHqjnPJxmOtye80jUJKLf\neeedNDU1DTnW2NiYnp6eYX390Ue3FF+ztXV88df8/55Yt2zEvwYAcHg4XLvgf2umw/U5fxg1+Zno\npqam9wVzT09Pmpuba7EcAAAoUpOIPvbYY9Pe3j7kWHt7e9ra2mqxHAAAKFKTiD7zzDPT29ubBx98\nMO+9914ee+yxdHZ25uyzz67FcgAAoEhdpVKp1OLCr7zySm666abs3LkzU6dOzU033eR1ogEAGBVq\nFtEAADBaedtvAAAoJKIBAKCQiAYAgEKHdUT//Oc/z/nnn5+TTz45y5Yty7/927/VekmMUtu2bcvn\nP//5zJkzJ5/97GfzyCOP1HpJjHKdnZ0588wz8+yzz9Z6KYxib7zxRlauXJlTTjkln/70p/PAAw/U\nekmMUtu3b895552XU045JYsWLcoTTzxR6yUd8g7biH733XezatWqnHfeeXnhhRdy0UUX5Utf+lJ6\ne3trvTRGma6urlxxxRW56KKL8sILL+TOO+/M7bffnq1bt9Z6aYxi119/ffbu3VvrZTCKVSqVXH
HF\nFTn22GPz3HPP5b777svdd9+d7du313ppjDL9/f1ZvXp1vvjFL2b79u355je/mWuvvTa/+c1var20\nQ9phG9E/+clPUl9fn+XLl2fs2LE5//zzc9RRR9n1oVhHR0fmzZuXpUuXpr6+PrNmzcrpp5/uf1R8\naD/4wQ/S1NSUT3ziE7VeCqPYiy++mN/97ne55pprMnbs2Bx33HF55JFHMn369FovjVHm97//ffbs\n2ZP+/v5UKpXU1dVl7NixGTNmTK2Xdkg7bCO6vb09M2bMGHJs+vTpefXVV2u0IkarE044Ibfddtvg\n511dXdm2bVuOP/74Gq6K0WrXrl357ne/m5tuuqnWS2GUe+mll3Lcccfltttuy1lnnZVFixblxRdf\nzFFHHVXrpTHKHHXUUVm+fHmuuuqqzJo1K1/4whdyww03+Iv+H9BQ6wUcKO+8806ampqGHGtsbExP\nT0+NVsTh4K233sqqVasya9asLFiwoNbLYZTp6+vL2rVrc/3116e1tbXWy2GU6+rqynPPPZczzjgj\nzz77bH72s5/lsssuy5QpU3LqqafWenmMIgMDA2lsbMydd96ZBQsWZOvWrbn66qsza9YsG0b/i8N2\nJ7qpqel9wdzT05Pm5uYarYjR7vXXX89f/uVf5sgjj8zdd9+d+vrD9o8PB8g999yTE044IfPmzav1\nUjgMHHHEETnyyCOzcuXKHHHEEYP/IGzz5s21XhqjzKZNm7Jjx46cc845OeKIIzJ//vzMnz8/P/rR\nj2q9tEPaYVsBxx57bNrb24cca29vT1tbW41WxGj20ksv5YILLsjZZ5+de+65J42NjbVeEqPQU089\nlX/6p3/KqaeemlNPPTUdHR256qqrcu+999Z6aYxC06dPT3d3d/r6+gaP/d+faYUSv/3tb9/3wgsN\nDQ1paDhsf2ChKg7biD7zzDPT29ubBx98MO+9914ee+yxdHZ25uyzz6710hhlOjs7c9lll+XSSy/N\nV7/6VTvQfGg//vGP89Of/jTbtm3Ltm3bMnny5Nx+++354he/WOulMQqdddZZ+djHPpZ169alr68v\n27dvz9NPP51zzjmn1ktjlJk7d25efvnl/PCHP0ylUsnzzz+fp59+OosWLar10g5pdZXD+K+sr7zy\nSm666abs3LkzU6dOzU033ZSTTz651stilPnOd76TO+64430/CnTxxRfnr/7qr2q0Kg4HCxYsyA03\n3JDPfOYztV4Ko9Rrr72Wr3/96/n3f//3tLS0ZPXq1fnc5z5X62UxCm3ZsiV33nlnXn/99UyePDlf\n/vKX8+d//ue1XtYh7bCOaAAAOBB8XxoAAAqJaAAAKCSiAQCgkIgGAIBCIhoAAAqJaAAAKCSiAQCg\nkIgGAIBCIhoAAAr9H0yTOQwCmayAAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x10cc5c7b8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Order rows by corpus, then author, so every list/array derived below lines up row-for-row\n",
    "geotyp = geotyp.sort_values(['corpus', 'author'])\n",
    "corpus_list = geotyp.corpus.tolist()\n",
    "author_list = geotyp.author.tolist()\n",
    "bookcounts = [int(n_vols) for n_vols in geotyp.volumes.tolist()]\n",
    "most_books = max(bookcounts)\n",
    "\n",
    "# Feature columns are selected by position (2 through 6).\n",
    "#  `.values` replaces `DataFrame.as_matrix()`, which is deprecated and removed in pandas 1.0\n",
    "typdata = geotyp.iloc[:, 2:7].values\n",
    "\n",
    "# Standardize each feature column; the printed mean/std are a sanity check on the scaling\n",
    "scaler = StandardScaler()\n",
    "typ_scaled = scaler.fit_transform(typdata)\n",
    "print(np.mean(typ_scaled), np.std(typ_scaled))\n",
    "print(typ_scaled.shape)\n",
    "\n",
    "# Euclidean distance of every row from the origin of the scaled feature space\n",
    "#  (pairing each row with an all-zeros row); smaller distance is treated as more typical below\n",
    "distances = paired_euclidean_distances(typ_scaled, np.zeros(typ_scaled.shape))\n",
    "fig = plt.hist(distances, bins=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Geotypical authors, from most typical to least\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Hathi      Keable, Robert,                      0.243997\n",
       "           Stern, G. B. (Gladys Bronwyn),       0.305803\n",
       "           Hutchison, William G.,               0.310108\n",
       "           Montague, C. E. (Charles Edward),    0.311446\n",
       "           Firbank, Ronald,                     0.331593\n",
       "           Somerville, E. Œ.                    0.338130\n",
       "           Treble, H. A.                        0.345683\n",
       "           Corner, Caroline.                    0.400956\n",
       "Prominent  Firbank, [Arthur Annesley] Ronald    0.401138\n",
       "Hathi      Treble, Henry Arthur.                0.409141\n",
       "dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distances indexed by (corpus, author). The plain constructor replaces\n",
    "#  `pd.Series.from_array`, which is deprecated and removed in pandas 1.0;\n",
    "#  passing two parallel lists as `index` yields a two-level MultiIndex.\n",
    "typ = pd.Series(distances, index=[corpus_list, author_list])\n",
    "print(\"Geotypical authors, from most typical to least\")\n",
    "typ.sort_values().head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Firbank, [Arthur Annesley] Ronald    0.401138\n",
       "West, Rebecca                        0.468818\n",
       "Aldington, Richard                   0.633435\n",
       "Huxley, Aldous                       0.638704\n",
       "Munro, Hector Hugh                   0.646356\n",
       "Lawrence, D. H.                      0.670664\n",
       "Conrad, Joseph                       0.679241\n",
       "Maugham, William Somerset            0.755554\n",
       "Wilde, Oscar                         0.779494\n",
       "Macaulay, Rose                       0.783740\n",
       "Greene, Graham                       0.794918\n",
       "Ford, Ford Madox                     0.814692\n",
       "Bowen, Elizabeth                     0.833852\n",
       "Bentley, E. C.                       0.840442\n",
       "James, Henry                         0.849743\n",
       "Isherwood, Christopher               0.895694\n",
       "Joyce, James                         0.909991\n",
       "Powell, Anthony                      0.922054\n",
       "Waugh, Evelyn                        0.924258\n",
       "Conan Doyle, Arthur                  0.947908\n",
       "dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Twenty smallest distances among rows whose corpus level is 'Prominent'\n",
    "prominent_typ = typ.loc['Prominent']\n",
    "prominent_typ.sort_values().head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>intl_frac</th>\n",
       "      <th>occurs_100k</th>\n",
       "      <th>world_subcity</th>\n",
       "      <th>nongb_subcountry</th>\n",
       "      <th>london_frac</th>\n",
       "      <th>distance</th>\n",
       "      <th>volumes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"8\" valign=\"top\">Hathi</th>\n",
       "      <th>Keable, Robert,</th>\n",
       "      <td>0.216450</td>\n",
       "      <td>-0.034379</td>\n",
       "      <td>-0.076337</td>\n",
       "      <td>-0.029146</td>\n",
       "      <td>-0.069466</td>\n",
       "      <td>0.243997</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Stern, G. B. (Gladys Bronwyn),</th>\n",
       "      <td>-0.139110</td>\n",
       "      <td>-0.103849</td>\n",
       "      <td>0.223005</td>\n",
       "      <td>0.115438</td>\n",
       "      <td>0.017939</td>\n",
       "      <td>0.305803</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Hutchison, William G.,</th>\n",
       "      <td>0.020145</td>\n",
       "      <td>0.164111</td>\n",
       "      <td>-0.189560</td>\n",
       "      <td>0.086310</td>\n",
       "      <td>-0.159519</td>\n",
       "      <td>0.310108</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Montague, C. E. (Charles Edward),</th>\n",
       "      <td>-0.008974</td>\n",
       "      <td>-0.167409</td>\n",
       "      <td>-0.190450</td>\n",
       "      <td>-0.142094</td>\n",
       "      <td>0.111492</td>\n",
       "      <td>0.311446</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Firbank, Ronald,</th>\n",
       "      <td>0.163647</td>\n",
       "      <td>0.208781</td>\n",
       "      <td>-0.023689</td>\n",
       "      <td>0.197401</td>\n",
       "      <td>0.007451</td>\n",
       "      <td>0.331593</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Somerville, E. Œ.</th>\n",
       "      <td>0.130332</td>\n",
       "      <td>-0.004447</td>\n",
       "      <td>0.286396</td>\n",
       "      <td>0.071137</td>\n",
       "      <td>-0.101207</td>\n",
       "      <td>0.338130</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Treble, H. A.</th>\n",
       "      <td>-0.216614</td>\n",
       "      <td>-0.007589</td>\n",
       "      <td>0.084707</td>\n",
       "      <td>-0.124838</td>\n",
       "      <td>-0.223064</td>\n",
       "      <td>0.345683</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Corner, Caroline.</th>\n",
       "      <td>0.365959</td>\n",
       "      <td>0.044048</td>\n",
       "      <td>-0.074586</td>\n",
       "      <td>-0.032851</td>\n",
       "      <td>0.135117</td>\n",
       "      <td>0.400956</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Prominent</th>\n",
       "      <th>Firbank, [Arthur Annesley] Ronald</th>\n",
       "      <td>0.245509</td>\n",
       "      <td>0.098186</td>\n",
       "      <td>-0.022528</td>\n",
       "      <td>0.300211</td>\n",
       "      <td>0.019038</td>\n",
       "      <td>0.401138</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Hathi</th>\n",
       "      <th>Treble, Henry Arthur.</th>\n",
       "      <td>-0.210674</td>\n",
       "      <td>-0.006646</td>\n",
       "      <td>0.241200</td>\n",
       "      <td>-0.130371</td>\n",
       "      <td>-0.218621</td>\n",
       "      <td>0.409141</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             intl_frac  occurs_100k  \\\n",
       "Hathi     Keable, Robert,                     0.216450    -0.034379   \n",
       "          Stern, G. B. (Gladys Bronwyn),     -0.139110    -0.103849   \n",
       "          Hutchison, William G.,              0.020145     0.164111   \n",
       "          Montague, C. E. (Charles Edward),  -0.008974    -0.167409   \n",
       "          Firbank, Ronald,                    0.163647     0.208781   \n",
       "          Somerville, E. Œ.                   0.130332    -0.004447   \n",
       "          Treble, H. A.                      -0.216614    -0.007589   \n",
       "          Corner, Caroline.                   0.365959     0.044048   \n",
       "Prominent Firbank, [Arthur Annesley] Ronald   0.245509     0.098186   \n",
       "Hathi     Treble, Henry Arthur.              -0.210674    -0.006646   \n",
       "\n",
       "                                             world_subcity  nongb_subcountry  \\\n",
       "Hathi     Keable, Robert,                        -0.076337         -0.029146   \n",
       "          Stern, G. B. (Gladys Bronwyn),          0.223005          0.115438   \n",
       "          Hutchison, William G.,                 -0.189560          0.086310   \n",
       "          Montague, C. E. (Charles Edward),      -0.190450         -0.142094   \n",
       "          Firbank, Ronald,                       -0.023689          0.197401   \n",
       "          Somerville, E. Œ.                       0.286396          0.071137   \n",
       "          Treble, H. A.                           0.084707         -0.124838   \n",
       "          Corner, Caroline.                      -0.074586         -0.032851   \n",
       "Prominent Firbank, [Arthur Annesley] Ronald      -0.022528          0.300211   \n",
       "Hathi     Treble, Henry Arthur.                   0.241200         -0.130371   \n",
       "\n",
       "                                             london_frac  distance  volumes  \n",
       "Hathi     Keable, Robert,                      -0.069466  0.243997        6  \n",
       "          Stern, G. B. (Gladys Bronwyn),        0.017939  0.305803       11  \n",
       "          Hutchison, William G.,               -0.159519  0.310108        1  \n",
       "          Montague, C. E. (Charles Edward),     0.111492  0.311446        1  \n",
       "          Firbank, Ronald,                      0.007451  0.331593        9  \n",
       "          Somerville, E. Œ.                    -0.101207  0.338130       11  \n",
       "          Treble, H. A.                        -0.223064  0.345683        1  \n",
       "          Corner, Caroline.                     0.135117  0.400956        1  \n",
       "Prominent Firbank, [Arthur Annesley] Ronald     0.019038  0.401138       10  \n",
       "Hathi     Treble, Henry Arthur.                -0.218621  0.409141        1  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-author table of standardized (z-score) features, indexed by (corpus, author).\n",
    "#  `distance` and `volumes` are appended as-is; they are not z-scored.\n",
    "zscore_columns = [\n",
    "    'intl_frac', \n",
    "    'occurs_100k', \n",
    "    'world_subcity', \n",
    "    'nongb_subcountry', \n",
    "    'london_frac'\n",
    "]\n",
    "typ_zscores = pd.DataFrame(\n",
    "    typ_scaled, \n",
    "    index=[corpus_list, author_list], \n",
    "    columns=zscore_columns\n",
    ")\n",
    "# Plain array/list assignment is positional, matching the row order of typ_scaled\n",
    "typ_zscores['distance'] = distances\n",
    "typ_zscores['volumes'] = bookcounts\n",
    "typ_zscores = typ_zscores.sort_values('distance')\n",
    "typ_zscores.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1) Sum of variance explained: 0.327104421445\n",
      "(2) Sum of variance explained: 0.533593304992\n",
      "Loadings:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>intl_frac</th>\n",
       "      <td>0.278234</td>\n",
       "      <td>-0.603692</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occurs_100k</th>\n",
       "      <td>0.446459</td>\n",
       "      <td>-0.378884</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>world_subcity</th>\n",
       "      <td>-0.597458</td>\n",
       "      <td>-0.262163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nongb_subcountry</th>\n",
       "      <td>-0.501829</td>\n",
       "      <td>-0.570508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>london_frac</th>\n",
       "      <td>-0.338335</td>\n",
       "      <td>0.312721</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         0         1\n",
       "intl_frac         0.278234 -0.603692\n",
       "occurs_100k       0.446459 -0.378884\n",
       "world_subcity    -0.597458 -0.262163\n",
       "nongb_subcountry -0.501829 -0.570508\n",
       "london_frac      -0.338335  0.312721"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Project the standardized features onto their first `num_dims` principal components\n",
    "num_dims = 2\n",
    "pca = PCA(n_components=num_dims)\n",
    "compute_reduced = pca.fit_transform(typ_scaled)\n",
    "\n",
    "# Cumulative share of variance explained by the first 1..num_dims components\n",
    "for n_pcs, pctvar in enumerate(np.cumsum(pca.explained_variance_ratio_), start=1):\n",
    "    print(\"(%s) Sum of variance explained: %s\" % (n_pcs, pctvar))\n",
    "\n",
    "# Loadings: one row per input feature, one column per component.\n",
    "#  Row labels are the first five column names of typ_zscores.\n",
    "loadings = pca.components_\n",
    "feature_names = typ_zscores.columns[:5]\n",
    "loadings_df = pd.DataFrame(loadings.T, index=feature_names)\n",
    "\n",
    "print(\"Loadings:\")\n",
    "display(loadings_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PCA coordinates, one row per (corpus, author) pair\n",
    "pc_coords = pd.DataFrame(\n",
    "    compute_reduced, \n",
    "    index=[corpus_list, author_list], \n",
    "    columns=['PC1', 'PC2']\n",
    ")\n",
    "# Left-join the z-score table on the shared index, then name the index levels\n",
    "#  and move them into ordinary 'corpus' and 'author' columns\n",
    "plotdata = pc_coords.merge(typ_zscores, how='left', left_index=True, right_index=True)\n",
    "plotdata.index = plotdata.index.set_names(['corpus', 'author'])\n",
    "plotdata = plotdata.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>corpus</th>\n",
       "      <th>author</th>\n",
       "      <th>PC1</th>\n",
       "      <th>PC2</th>\n",
       "      <th>intl_frac</th>\n",
       "      <th>occurs_100k</th>\n",
       "      <th>world_subcity</th>\n",
       "      <th>nongb_subcountry</th>\n",
       "      <th>london_frac</th>\n",
       "      <th>distance</th>\n",
       "      <th>volumes</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Foreign</td>\n",
       "      <td>Ali, Ahmed</td>\n",
       "      <td>0.101855</td>\n",
       "      <td>-0.904247</td>\n",
       "      <td>1.139574</td>\n",
       "      <td>-0.458467</td>\n",
       "      <td>-0.329109</td>\n",
       "      <td>0.645582</td>\n",
       "      <td>-0.345268</td>\n",
       "      <td>1.467351</td>\n",
       "      <td>1</td>\n",
       "      <td>Ali, Ahmed (1 vols) 1.47&lt;br&gt;International %: 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Foreign</td>\n",
       "      <td>Ali, Mohomed Duse</td>\n",
       "      <td>5.064223</td>\n",
       "      <td>-1.153943</td>\n",
       "      <td>0.534191</td>\n",
       "      <td>5.582349</td>\n",
       "      <td>-1.458856</td>\n",
       "      <td>-2.257826</td>\n",
       "      <td>-1.237389</td>\n",
       "      <td>6.340753</td>\n",
       "      <td>1</td>\n",
       "      <td>Ali, Mohomed Duse (1 vols) 6.34&lt;br&gt;Internation...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Foreign</td>\n",
       "      <td>Anand, Mulk Raj</td>\n",
       "      <td>0.715623</td>\n",
       "      <td>-0.144124</td>\n",
       "      <td>0.792578</td>\n",
       "      <td>0.149212</td>\n",
       "      <td>0.013065</td>\n",
       "      <td>-0.771084</td>\n",
       "      <td>-0.145822</td>\n",
       "      <td>1.125367</td>\n",
       "      <td>10</td>\n",
       "      <td>Anand, Mulk Raj (10 vols) 1.13&lt;br&gt;Internationa...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Foreign</td>\n",
       "      <td>Azikiwe, Benjamin Nnamdi</td>\n",
       "      <td>3.640428</td>\n",
       "      <td>-0.887768</td>\n",
       "      <td>0.988956</td>\n",
       "      <td>3.648904</td>\n",
       "      <td>-0.412990</td>\n",
       "      <td>-2.281802</td>\n",
       "      <td>-1.017806</td>\n",
       "      <td>4.550346</td>\n",
       "      <td>1</td>\n",
       "      <td>Azikiwe, Benjamin Nnamdi (1 vols) 4.55&lt;br&gt;Inte...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Foreign</td>\n",
       "      <td>Baijnath, Lala</td>\n",
       "      <td>3.685660</td>\n",
       "      <td>0.219612</td>\n",
       "      <td>-0.513710</td>\n",
       "      <td>4.237920</td>\n",
       "      <td>-0.958198</td>\n",
       "      <td>-2.440915</td>\n",
       "      <td>-0.411214</td>\n",
       "      <td>5.026845</td>\n",
       "      <td>1</td>\n",
       "      <td>Baijnath, Lala (1 vols) 5.03&lt;br&gt;International ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    corpus                    author       PC1       PC2  intl_frac  \\\n",
       "0  Foreign                Ali, Ahmed  0.101855 -0.904247   1.139574   \n",
       "1  Foreign         Ali, Mohomed Duse  5.064223 -1.153943   0.534191   \n",
       "2  Foreign           Anand, Mulk Raj  0.715623 -0.144124   0.792578   \n",
       "3  Foreign  Azikiwe, Benjamin Nnamdi  3.640428 -0.887768   0.988956   \n",
       "4  Foreign            Baijnath, Lala  3.685660  0.219612  -0.513710   \n",
       "\n",
       "   occurs_100k  world_subcity  nongb_subcountry  london_frac  distance  \\\n",
       "0    -0.458467      -0.329109          0.645582    -0.345268  1.467351   \n",
       "1     5.582349      -1.458856         -2.257826    -1.237389  6.340753   \n",
       "2     0.149212       0.013065         -0.771084    -0.145822  1.125367   \n",
       "3     3.648904      -0.412990         -2.281802    -1.017806  4.550346   \n",
       "4     4.237920      -0.958198         -2.440915    -0.411214  5.026845   \n",
       "\n",
       "   volumes                                              label  \n",
       "0        1  Ali, Ahmed (1 vols) 1.47<br>International %: 1...  \n",
       "1        1  Ali, Mohomed Duse (1 vols) 6.34<br>Internation...  \n",
       "2       10  Anand, Mulk Raj (10 vols) 1.13<br>Internationa...  \n",
       "3        1  Azikiwe, Benjamin Nnamdi (1 vols) 4.55<br>Inte...  \n",
       "4        1  Baijnath, Lala (1 vols) 5.03<br>International ...  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hover text for each bubble: author, volume count, and distance on the first line,\n",
    "#  then one '<br>'-separated line per feature, each value rounded to two decimals.\n",
    "label_fields = [\n",
    "    ('International %: ', 'intl_frac'),\n",
    "    ('Mentions/100k: ', 'occurs_100k'),\n",
    "    ('Global subcity %: ', 'world_subcity'),\n",
    "    ('Non-GB subcountry %: ', 'nongb_subcountry'),\n",
    "    ('London/GB %: ', 'london_frac'),\n",
    "]\n",
    "hover_text = (\n",
    "    plotdata.author + ' (' + plotdata.volumes.apply(str) + ' vols) ' +\n",
    "    plotdata.distance.round(2).apply(str)\n",
    ")\n",
    "for prefix, field in label_fields:\n",
    "    hover_text = hover_text + '<br>' + prefix + plotdata[field].round(2).apply(str)\n",
    "plotdata['label'] = hover_text\n",
    "plotdata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~mattwilkens/119.embed\" height=\"525px\" width=\"100%\"></iframe>"
      ],
      "text/plain": [
       "<plotly.tools.PlotlyDisplay object>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Interactive plot: one Scattergl trace per corpus, bubble area set by volume count\n",
    "traces = []\n",
    "biggest_bubble = plotdata.volumes.max()\n",
    "# Plotly's bubble-chart sizing rule: sizeref = 2 * max(size) / (largest marker size ** 2);\n",
    "#  75 is the largest-marker term. The value is the same for every trace, so compute it once.\n",
    "bubble_sizeref = 2.*biggest_bubble/(75.**2)\n",
    "\n",
    "for c in corpora:\n",
    "    # The 'Hathi' trace is drawn nearly transparent; every other corpus uses opacity 0.4\n",
    "    opac = 0.02 if c == 'Hathi' else 0.4\n",
    "    # Filter once per corpus. Sizes are read straight from this subset rather than\n",
    "    #  rebinding the notebook-level `bookcounts` list, which the z-score cell relies on\n",
    "    #  being the full, row-aligned list of volume counts.\n",
    "    corpus_rows = plotdata[plotdata.corpus == c]\n",
    "    trace = go.Scattergl(\n",
    "        x=corpus_rows['PC1'],\n",
    "        y=corpus_rows['PC2'],\n",
    "        mode='markers',\n",
    "        marker=dict(\n",
    "            size=corpus_rows.volumes.tolist(),\n",
    "            sizemode='area',\n",
    "            sizeref=bubble_sizeref,\n",
    "            line=dict(width=0),\n",
    "            color=colors[c],\n",
    "            opacity=opac\n",
    "        ),\n",
    "        name=c,\n",
    "        text=corpus_rows.label.tolist()  # hover text\n",
    "    )\n",
    "    traces.append(trace)\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='British literary geography, 1880-1940',\n",
    "    hovermode='closest',\n",
    "    xaxis=dict(\n",
    "        title=\"PC1: more international, more intensive, less specific, less London →\",\n",
    "        ticklen=5,\n",
    "        zeroline=False,\n",
    "        gridwidth=2,\n",
    "    ),\n",
    "    yaxis=dict(\n",
    "        title=\"PC2: more London, less international, less intensive, less specific →\",\n",
    "        ticklen=5,\n",
    "        gridwidth=2,\n",
    "    ),\n",
    "    showlegend=True\n",
    ")\n",
    "fig = go.Figure(data=traces, layout=layout)\n",
    "# Sends the figure to the plotly service under this filename and embeds the result\n",
    "py.iplot(fig, filename='geotyp multi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write a static PNG of the figure into the figures directory\n",
    "png_path = os.path.join(figDir, 'geotyp.png')\n",
    "py.image.save_as(fig, filename=png_path, scale=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Online visualization\n",
    "\n",
    "See the [full interactive visualization](https://plot.ly/~mattwilkens/119/british-literary-geography-1880-1940/#/) online if it isn't rendered above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
